{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-05T00:11:56.911705Z",
     "start_time": "2019-05-05T00:11:55.534886Z"
    }
   },
   "outputs": [],
   "source": [
    "from IPython.display import display\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib as mpl\n",
    "import warnings\n",
    "import time\n",
    "import json\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "\n",
    "plt.style.use(\"fivethirtyeight\")\n",
    "\n",
    "sns.set_style({'font.sans-serif': ['simsun', 'Arial']})\n",
    "sns.set_style('darkgrid', {'font.sans-serif': ['simhei', 'Arial']})\n",
    "%matplotlib inline\n",
    "\n",
    "holidays = pd.Series(json.load(open(r'../../Data/Holidays.json'))['holidays'])\n",
    "subwayinfo = pd.read_csv(r'../../Data/BeijingSubway.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-05T00:11:58.734272Z",
     "start_time": "2019-05-05T00:11:58.135740Z"
    }
   },
   "outputs": [],
   "source": [
    "df_copy = pd.read_csv(\n",
    "    r'../../Data/data_set_phase1/train_queries.csv')\n",
    "df = df_copy.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-05T00:12:08.822281Z",
     "start_time": "2019-05-05T00:12:04.414620Z"
    }
   },
   "outputs": [],
   "source": [
    "df['req_time'] = pd.to_datetime(df['req_time'])\n",
    "df['day_of_week'] = df['req_time'].dt.day_name()\n",
    "df['req_date'] = df['req_time'].dt.strftime('%m-%d')\n",
    "df['req_hour'] = df['req_time'].dt.hour\n",
    "df['req_minute'] = df['req_time'].dt.minute\n",
    "df['if_holiday'] = (df['req_date'].isin(holidays)).astype(int)\n",
    "\n",
    "# Week day\n",
    "current_c = list(df.columns.values)\n",
    "weekdays = ['Monday', 'Tuesday', 'Wednesday',\n",
    "            'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
    "current_c.extend(weekdays)\n",
    "df = df.reindex(columns=current_c, fill_value=0)\n",
    "for day_name in weekdays:\n",
    "    df.loc[(df['day_of_week'] == day_name), [day_name]] = 1\n",
    "\n",
    "# Hour\n",
    "current_c = list(df.columns.values)\n",
    "oclock = [str(i)+'_oclock' for i in range(0, 24)]\n",
    "current_c.extend(oclock)\n",
    "df = df.reindex(columns=current_c, fill_value=0)\n",
    "for h in oclock:\n",
    "    df.loc[(df['req_hour'] == int(h.split('_')[0])), [h]] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-05T00:15:46.326181Z",
     "start_time": "2019-05-05T00:15:41.997318Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "date Done.\n"
     ]
    }
   ],
   "source": [
    "df['date_d'] = df['req_time'].dt.strftime('%d').astype(int)\n",
    "\n",
    "\n",
    "current_c = list(df.columns.values)\n",
    "date = [str(i)+'_date' for i in range(1, 32)]\n",
    "current_c.extend(date)\n",
    "df = df.reindex(columns=current_c, fill_value=0)\n",
    "for d in date:\n",
    "    df.loc[(df['date_d'] == int(d.split('_')[0])), [d]] = 1\n",
    "print('date Done.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-05-05T00:15:53.918503Z",
     "start_time": "2019-05-05T00:15:53.892303Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sid</th>\n",
       "      <th>pid</th>\n",
       "      <th>req_time</th>\n",
       "      <th>o</th>\n",
       "      <th>d</th>\n",
       "      <th>day_of_week</th>\n",
       "      <th>req_date</th>\n",
       "      <th>req_hour</th>\n",
       "      <th>req_minute</th>\n",
       "      <th>if_holiday</th>\n",
       "      <th>...</th>\n",
       "      <th>22_date</th>\n",
       "      <th>23_date</th>\n",
       "      <th>24_date</th>\n",
       "      <th>25_date</th>\n",
       "      <th>26_date</th>\n",
       "      <th>27_date</th>\n",
       "      <th>28_date</th>\n",
       "      <th>29_date</th>\n",
       "      <th>30_date</th>\n",
       "      <th>31_date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3000821</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-11-02 17:54:30</td>\n",
       "      <td>116.29,39.97</td>\n",
       "      <td>116.32,39.96</td>\n",
       "      <td>Friday</td>\n",
       "      <td>11-02</td>\n",
       "      <td>17</td>\n",
       "      <td>54</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3085857</td>\n",
       "      <td>210736.0</td>\n",
       "      <td>2018-11-16 10:53:10</td>\n",
       "      <td>116.39,39.84</td>\n",
       "      <td>116.33,39.79</td>\n",
       "      <td>Friday</td>\n",
       "      <td>11-16</td>\n",
       "      <td>10</td>\n",
       "      <td>53</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2944522</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-10-06 10:33:58</td>\n",
       "      <td>116.31,39.93</td>\n",
       "      <td>116.27,40.00</td>\n",
       "      <td>Saturday</td>\n",
       "      <td>10-06</td>\n",
       "      <td>10</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>559931</td>\n",
       "      <td>202427.0</td>\n",
       "      <td>2018-11-23 14:54:11</td>\n",
       "      <td>116.27,39.88</td>\n",
       "      <td>116.39,39.90</td>\n",
       "      <td>Friday</td>\n",
       "      <td>11-23</td>\n",
       "      <td>14</td>\n",
       "      <td>54</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2819352</td>\n",
       "      <td>172251.0</td>\n",
       "      <td>2018-10-30 11:48:41</td>\n",
       "      <td>116.34,39.96</td>\n",
       "      <td>116.37,39.86</td>\n",
       "      <td>Tuesday</td>\n",
       "      <td>10-30</td>\n",
       "      <td>11</td>\n",
       "      <td>48</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 73 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       sid       pid            req_time             o             d  \\\n",
       "0  3000821       NaN 2018-11-02 17:54:30  116.29,39.97  116.32,39.96   \n",
       "1  3085857  210736.0 2018-11-16 10:53:10  116.39,39.84  116.33,39.79   \n",
       "2  2944522       NaN 2018-10-06 10:33:58  116.31,39.93  116.27,40.00   \n",
       "3   559931  202427.0 2018-11-23 14:54:11  116.27,39.88  116.39,39.90   \n",
       "4  2819352  172251.0 2018-10-30 11:48:41  116.34,39.96  116.37,39.86   \n",
       "\n",
       "  day_of_week req_date  req_hour  req_minute  if_holiday  ...  22_date  \\\n",
       "0      Friday    11-02        17          54           0  ...        0   \n",
       "1      Friday    11-16        10          53           0  ...        0   \n",
       "2    Saturday    10-06        10          33           1  ...        0   \n",
       "3      Friday    11-23        14          54           0  ...        0   \n",
       "4     Tuesday    10-30        11          48           0  ...        0   \n",
       "\n",
       "   23_date  24_date  25_date  26_date  27_date  28_date  29_date  30_date  \\\n",
       "0        0        0        0        0        0        0        0        0   \n",
       "1        0        0        0        0        0        0        0        0   \n",
       "2        0        0        0        0        0        0        0        0   \n",
       "3        1        0        0        0        0        0        0        0   \n",
       "4        0        0        0        0        0        0        0        1   \n",
       "\n",
       "   31_date  \n",
       "0        0  \n",
       "1        0  \n",
       "2        0  \n",
       "3        0  \n",
       "4        0  \n",
       "\n",
       "[5 rows x 73 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2019-05-01T13:33:52.620Z"
    }
   },
   "outputs": [],
   "source": [
    "df.drop(columns=['day_of_week','req_date', 'req_hour'], inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2019-05-01T13:33:52.621Z"
    }
   },
   "outputs": [],
   "source": [
    "df['o_lng'] = df['o'].apply(lambda x: float(x.split(',')[0]))\n",
    "df['o_lat'] = df['o'].apply(lambda x: float(x.split(',')[1]))\n",
    "df['d_lng'] = df['d'].apply(lambda x: float(x.split(',')[0]))\n",
    "df['d_lat'] = df['d'].apply(lambda x: float(x.split(',')[1]))\n",
    "df['od_manhattan_distance'] = abs(\n",
    "    df['o_lng']-df['d_lng'])+abs(df['o_lat']-df['d_lat'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2019-05-01T13:33:52.623Z"
    }
   },
   "outputs": [],
   "source": [
    "df['o_nearest_dis'] = np.nan\n",
    "df['d_nearest_dis'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2019-05-01T13:33:52.624Z"
    }
   },
   "outputs": [],
   "source": [
    "for index in df.index:\n",
    "    df[index, 'o_nearest_dis'] = (abs(subwayinfo['station_longitude']-df.loc[index]\n",
    "                                      ['o_lng'])+abs(subwayinfo['station_latitude']-df.loc[index]['o_lat'])).min()\n",
    "    df[index, 'd_nearest_dis'] = (abs(subwayinfo['station_longitude']-df.loc[index]\n",
    "                                      ['d_lng'])+abs(subwayinfo['station_latitude']-df.loc[index]['d_lat'])).min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2019-05-01T13:33:52.626Z"
    }
   },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
